Section 1
suppressWarnings(library(data.table))
suppressWarnings(library(ggplot2))
suppressWarnings(library(plotly))
## Note: the specification for S3 class "AsIs" in package 'DBI' seems equivalent to one from package 'jsonlite': not turning on duplicate class definitions for this class.
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
rawData = read.csv("C:/Users/Matthew/Documents/SunshineList/Inputs/2015-combined-salary-seconded-en.csv", stringsAsFactors = F)
rawData[, 4] = as.numeric(gsub("[$,]","", rawData[, 4]))
rawData_universities = rawData[rawData$Sector == "Universities", ]
for (i in (1:length(unique(rawData_universities$Employer)))){
#print(unique(rawData_universities$Employer)[i])
filtered_df = rawData_universities[rawData_universities$Employer == unique(rawData_universities$Employer)[i], ]
hist(filtered_df$Salary.Paid)
}



































# boxplot(x = rawData_universities$Salary.Paid, data = filtered_df, formula = rawData_universities$Salary.Paid ~ rawData_universities$Employer)
top_universities = c("University of Toronto", "University of Ottawa", "University of Waterloo", "University of Western Ontario")
set1 = unique(rawData_universities$Employer)[1:12]
set2 = unique(rawData_universities$Employer)[13:25]
set3 = unique(rawData_universities$Employer)[25:55]
# Find the distribution of salaries across schools
rawData_universities_split_1 = subset(rawData_universities, Employer %in% set1)
rawData_universities_split_2 = subset(rawData_universities, Employer %in% set2)
rawData_universities_split_3 = subset(rawData_universities, Employer %in% set3)
boxplot(Salary.Paid ~ Employer, data = rawData_universities, las=2)

boxplot(Salary.Paid ~ Employer, data = rawData_universities_split_1, las=2)

boxplot(Salary.Paid ~ Employer, data = rawData_universities_split_2, las=2)

boxplot(Salary.Paid ~ Employer, data = rawData_universities_split_3, las=2)

ggplot(rawData_universities, aes(x=factor(Employer), y=Salary.Paid)) + geom_boxplot() + theme(axis.text.x = element_text(angle=90, vjust=0.5))

ggplot(rawData_universities, aes(x=factor(Employer), y=Salary.Paid)) + geom_boxplot() + theme(axis.text.x = element_text(angle=90, vjust=0.5)) + ylim(0,500000)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

rawData_universities$City = NA
# Find the distribution of salaries vs roles (for Brock, e.g.)
rawData_universities_Brock = rawData_universities[rawData_universities$Employer == "Brock University", ]
#table(rawData_universities_Brock$Job.title)
plot(table(rawData_universities_Brock$Job.title))

ggplot(rawData_universities_Brock, aes(x=factor(Job.title), y=Salary.Paid)) + geom_boxplot() + theme(axis.text.x = element_text(angle=90, vjust=0.5)) + ylim(0,500000)

rawData_universities_Brock_noDup = rawData_universities_Brock[(duplicated(rawData_universities_Brock$Job.title) | duplicated(rawData_universities_Brock$Job.title, fromLast = TRUE)), ]
rawData_universities_Brock_noDup$Job.title = with(rawData_universities_Brock_noDup, reorder(Job.title, Salary.Paid, median))
ggplot(rawData_universities_Brock_noDup, aes(x=factor(Job.title), y=Salary.Paid)) + geom_boxplot() + theme(axis.text.x = element_text(angle=90, vjust=0.5)) + ylim(0,500000)

# Create the table condensing down to school
table(rawData_universities$Employer)
##
## Algoma University
## 43
## Brescia University College
## 25
## Brock University
## 583
## Carleton University
## 778
## Huntington University
## 8
## Huron University College
## 30
## King's University College
## 88
## Lakehead University
## 300
## Laurentian University of Sudbury
## 385
## McMaster Divinity College
## 3
## McMaster University
## 1137
## Nipissing University
## 104
## Northern Ontario School of Medicine
## 37
## Ontario College of Art & Design University
## 108
## Queen's University
## 995
## Ryerson University
## 1046
## Saint Paul University / Université Saint-Paul
## 16
## St. Jerome's University
## 24
## St. Peter's Seminary
## 2
## Thorneloe University
## 7
## Trent University
## 249
## Trinity College
## 14
## Université de Hearst
## 4
## University of Guelph
## 830
## University of Ontario Institute of Technology
## 194
## University of Ottawa
## 1400
## University of St. Michael's College
## 20
## University of Sudbury
## 9
## University of Toronto
## 3288
## University of Waterloo
## 1295
## University of Western Ontario
## 1298
## University of Windsor
## 546
## Victoria University
## 34
## Wilfrid Laurier University
## 556
## York University
## 1609
summary_university = setDT(rawData_universities)[, list(Median.Salary = median(Salary.Paid)), by=Employer]
# Refactor next line by putting city in whole dataframe, then condensing down
summary_university$City = c("Sault Ste Marie", "London", "Ste Catharines", "Ottawa", "Sudbury", "London", "London", "Thunder Bay", "Sudbury", "Hamilton",
"Hamilton", "North Bay", "Sudbury", "Toronto", "Kingston", "Toronto", "Ottawa", "Waterloo", "London", "Sudbury", "Peterborough",
"Toronto", "Hearst", "Ottawa", "Guelph", "Oshawa", "Toronto","Sudbury", "Toronto", "Waterloo", "London", "Windsor", "Toronto", "Waterloo",
"Toronto")
summary_city = summary_university[, list(Average.Salary = mean(Median.Salary)), by=City]
barplot(height = summary_city$Average.Salary, names.arg = summary_city$City, las=2)

rawData_universities_dt = data.table(rawData_universities)
rawData_universities_dt$City[rawData_universities_dt$Employer == "University of Toronto" |
rawData_universities_dt$Employer == "York University" |
rawData_universities_dt$Employer == "Ryerson University" |
rawData_universities_dt$Employer == "University of St. Michael's College" |
rawData_universities_dt$Employer == "Victoria University" |
rawData_universities_dt$Employer == "Ontario College of Art & Design University" |
rawData_universities_dt$Employer == "Trinity College"] = "Toronto"
rawData_universities_dt$City[rawData_universities_dt$Employer == "University of Ottawa" |
rawData_universities_dt$Employer == "Carleton University" |
rawData_universities_dt$Employer == "Saint Paul University / Université Saint-Paul"] = "Ottawa"
rawData_universities_dt$City[rawData_universities_dt$Employer == "University of Western Ontario" |
rawData_universities_dt$Employer == "Brescia University College" |
rawData_universities_dt$Employer == "Huron University College" |
rawData_universities_dt$Employer == "King's University College" |
rawData_universities_dt$Employer == "St. Peter's Seminary"] = "London"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Algoma University"] = "Sault Ste Marie"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Brock University"] = "Ste Catharines"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Huntington University" |
rawData_universities_dt$Employer == "Laurentian University of Sudbury" |
rawData_universities_dt$Employer == "Northern Ontario School of Medicine" |
rawData_universities_dt$Employer == "Thorneloe University" |
rawData_universities_dt$Employer == "University of Sudbury"] = "Sudbury"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Lakehead University"] = "Thunder Bay"
rawData_universities_dt$City[rawData_universities_dt$Employer == "McMaster University" |
rawData_universities_dt$Employer == "McMaster Divinity College"] = "Hamilton"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Nipissing University"] = "North Bay"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Queen's University"] = "Kingston"
rawData_universities_dt$City[rawData_universities_dt$Employer == "University of Guelph"] = "Guelph"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Université de Hearst"] = "Hearst"
rawData_universities_dt$City[rawData_universities_dt$Employer == "University of Ontario Institute of Technology"] = "Oshawa"
rawData_universities_dt$City[rawData_universities_dt$Employer == "Trent University"] = "Peterborough"
rawData_universities_dt$City[rawData_universities_dt$Employer == "St. Jerome's University" |
rawData_universities_dt$Employer == "University of Waterloo" |
rawData_universities_dt$Employer == "Wilfrid Laurier University"] = "Waterloo"
rawData_universities_dt$City[rawData_universities_dt$Employer == "University of Windsor"] = "Windsor"
summary_city = rawData_universities_dt[, list(Median.Salary = median(Salary.Paid)), by=City]
summary_city = summary_city[order(Median.Salary)]
barplot(height = summary_city$Median.Salary, names.arg = summary_city$City, las=2, ylab = "Median Salary for City ($)")

rawData_universities_dt$City = with(rawData_universities_dt, reorder(City, Salary.Paid, median))
city_vs_salary_plot = ggplot(rawData_universities_dt, aes(x=factor(City), y=Salary.Paid))+ geom_boxplot() + theme(axis.text.x = element_text(angle=90, vjust=0.5)) + ylim(0,500000)
ggplotly(city_vs_salary_plot)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
# Find the cost of living for city and plot against median salary for university
# Find the tuition for school and plot against median salary for university
# Find the school's ranking and plot against median salary for university